In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.cross_validation  import train_test_split
from sklearn.metrics import classification_report
from tools import tester
from tools.tester import dump_classifier_and_data
from sklearn import preprocessing
from sklearn.svm import SVC

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
Finding POI's in the Enron Dataset

Understanding the Dataset and Question

Data Exploration

Dataset Contents

final_project_dataset.pkl Financial data from the included enron61712insiderpay.pdf have been combined into a dictionary in the included final_project_dataset.pkl file. In the dictionary, the key is the person's name, and the value is another dictionary, which contains the names of all the features and their values for that person. The features in the data fall into three major types, namely financial features, email features and POI labels.

financial features:

# (all units are in US dollars)
[
 'salary',
 'deferral_payments',
 'total_payments',
 'loan_advances',
 'bonus',
 'restricted_stock_deferred',
 'deferred_income',
 'total_stock_value',
 'expenses',
 'exercised_stock_options',
 'other',
 'long_term_incentive',
 'restricted_stock',
 'director_fees'
]

email features:

# (units are generally number of emails messages that reference the ; notable exception is ‘email_address’, which is a text string)
[
 'to_messages',
 'email_address',
 'from_poi_to_this_person',
 'from_messages',
 'from_this_person_to_poi',
 'poi', # POI Label (boolean, represented as integer).
 'shared_receipt_with_poi'
]

Persons of Interest


In [2]:
cat ./poi_names.txt


http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm
First column indicates whether or not the email inbox can be found in the Enron dataset

(y) Lay, Kenneth
(y) Skilling, Jeffrey
(n) Howard, Kevin
(n) Krautz, Michael
(n) Yeager, Scott
(n) Hirko, Joseph
(n) Shelby, Rex
(n) Bermingham, David
(n) Darby, Giles
(n) Mulgrew, Gary
(n) Bayley, Daniel
(n) Brown, James
(n) Furst, Robert
(n) Fuhs, William
(n) Causey, Richard
(n) Calger, Christopher
(n) DeSpain, Timothy
(n) Hannon, Kevin
(n) Koenig, Mark
(y) Forney, John
(n) Rice, Kenneth
(n) Rieker, Paula
(n) Fastow, Lea
(n) Fastow, Andrew
(y) Delainey, David
(n) Glisan, Ben
(n) Richter, Jeffrey
(n) Lawyer, Larry
(n) Belden, Timothy
(n) Kopper, Michael
(n) Duncan, David
(n) Bowen, Raymond
(n) Colwell, Wesley
(n) Boyle, Dan
(n) Loehr, Christopher

This file contains a list of 35 people who were a person of interest in the Enron scandal. A POI is defined as someone who was:

  • indicted
  • settled without admitting guilt
  • testified in exchange for immunity

Summary of Dataset


In [3]:
enron_data = pickle.load(open("./final_project_dataset.pkl"))

In [4]:
enron_data.iteritems().next()


Out[4]:
('METTS MARK',
 {'bonus': 600000,
  'deferral_payments': 'NaN',
  'deferred_income': 'NaN',
  'director_fees': 'NaN',
  'email_address': 'mark.metts@enron.com',
  'exercised_stock_options': 'NaN',
  'expenses': 94299,
  'from_messages': 29,
  'from_poi_to_this_person': 38,
  'from_this_person_to_poi': 1,
  'loan_advances': 'NaN',
  'long_term_incentive': 'NaN',
  'other': 1740,
  'poi': False,
  'restricted_stock': 585062,
  'restricted_stock_deferred': 'NaN',
  'salary': 365788,
  'shared_receipt_with_poi': 702,
  'to_messages': 807,
  'total_payments': 1061827,
  'total_stock_value': 585062})

Features in the dataset:

In the dataset there are 146 employees with 20 features and 'poi' label for each employee. To further summarize the features in the dataset, we will convert the dictionary to a pandas DataFrame.


In [5]:
# Replace "Nan" with NaN
for columns in enron_data.itervalues():
    for k,v in columns.iteritems():
        if type(v) is str and v.lower() == "nan":
            columns[k] = np.nan

In [6]:
enron_df = pd.DataFrame.from_dict(enron_data, orient="index")
enron_df


Out[6]:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value expenses loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive email_address from_poi_to_this_person
ALLEN PHILLIP K 201955.0 2902.0 2869717.0 4484442.0 1729541.0 4175000.0 126027.0 1407.0 -126027.0 1729541.0 13868.0 NaN 2195.0 152.0 65.0 False NaN -3081055.0 304805.0 phillip.allen@enron.com 47.0
BADUM JAMES P NaN NaN 178980.0 182466.0 257817.0 NaN NaN NaN NaN 257817.0 3486.0 NaN NaN NaN NaN False NaN NaN NaN NaN NaN
BANNANTINE JAMES M 477.0 566.0 NaN 916197.0 4046157.0 NaN 1757552.0 465.0 -560222.0 5243487.0 56301.0 NaN 29.0 864523.0 0.0 False NaN -5104.0 NaN james.bannantine@enron.com 39.0
BAXTER JOHN C 267102.0 NaN 1295738.0 5634343.0 6680544.0 1200000.0 3942714.0 NaN NaN 10623258.0 11200.0 NaN NaN 2660303.0 NaN False NaN -1386055.0 1586055.0 NaN NaN
BAY FRANKLIN R 239671.0 NaN 260455.0 827696.0 NaN 400000.0 145796.0 NaN -82782.0 63014.0 129142.0 NaN NaN 69.0 NaN False NaN -201641.0 NaN frank.bay@enron.com NaN
BAZELIDES PHILIP J 80818.0 NaN 684694.0 860136.0 1599641.0 NaN NaN NaN NaN 1599641.0 NaN NaN NaN 874.0 NaN False NaN NaN 93750.0 NaN NaN
BECK SALLY W 231330.0 7315.0 NaN 969068.0 NaN 700000.0 126027.0 2639.0 NaN 126027.0 37172.0 NaN 4343.0 566.0 386.0 False NaN NaN NaN sally.beck@enron.com 144.0
BELDEN TIMOTHY N 213999.0 7991.0 2144013.0 5501630.0 953136.0 5249999.0 157569.0 5521.0 NaN 1110705.0 17355.0 NaN 484.0 210698.0 108.0 True NaN -2334434.0 NaN tim.belden@enron.com 228.0
BELFER ROBERT NaN NaN -102500.0 102500.0 3285.0 NaN NaN NaN 44093.0 -44093.0 NaN NaN NaN NaN NaN False 3285.0 NaN NaN NaN NaN
BERBERIAN DAVID 216582.0 NaN NaN 228474.0 1624396.0 NaN 869220.0 NaN NaN 2493616.0 11892.0 NaN NaN NaN NaN False NaN NaN NaN david.berberian@enron.com NaN
BERGSIEKER RICHARD P 187922.0 383.0 NaN 618850.0 NaN 250000.0 659249.0 233.0 NaN 659249.0 59175.0 NaN 59.0 427316.0 0.0 False NaN -485813.0 180250.0 rick.bergsieker@enron.com 4.0
BHATNAGAR SANJAY NaN 523.0 NaN 15456290.0 2604490.0 NaN -2604490.0 463.0 15456290.0 NaN NaN NaN 29.0 137864.0 1.0 False 137864.0 NaN NaN sanjay.bhatnagar@enron.com 0.0
BIBI PHILIPPE A 213625.0 1607.0 NaN 2047593.0 1465734.0 1000000.0 378082.0 1336.0 NaN 1843816.0 38559.0 NaN 40.0 425688.0 8.0 False NaN NaN 369721.0 philippe.bibi@enron.com 23.0
BLACHMAN JEREMY M 248546.0 2475.0 NaN 2014835.0 765313.0 850000.0 189041.0 2326.0 NaN 954354.0 84208.0 NaN 14.0 272.0 2.0 False NaN NaN 831809.0 jeremy.blachman@enron.com 25.0
BLAKE JR. NORMAN P NaN NaN NaN 1279.0 NaN NaN NaN NaN NaN NaN 1279.0 NaN NaN NaN NaN False 113784.0 -113784.0 NaN NaN NaN
BOWEN JR RAYMOND M 278601.0 1858.0 NaN 2669589.0 NaN 1350000.0 252055.0 1593.0 NaN 252055.0 65907.0 NaN 27.0 1621.0 15.0 True NaN -833.0 974293.0 raymond.bowen@enron.com 140.0
BROWN MICHAEL NaN 1486.0 NaN 49288.0 NaN NaN NaN 761.0 NaN NaN 49288.0 NaN 41.0 NaN 1.0 False NaN NaN NaN michael.brown@enron.com 13.0
BUCHANAN HAROLD G 248017.0 1088.0 NaN 1054637.0 825464.0 500000.0 189041.0 23.0 NaN 1014505.0 600.0 NaN 125.0 1215.0 0.0 False NaN NaN 304805.0 john.buchanan@enron.com 0.0
BUTTS ROBERT H 261516.0 NaN NaN 1271582.0 NaN 750000.0 417619.0 NaN NaN 417619.0 9410.0 NaN NaN 150656.0 NaN False NaN -75000.0 175000.0 bob.butts@enron.com NaN
BUY RICHARD B 330546.0 3523.0 649584.0 2355702.0 2542813.0 900000.0 901657.0 2333.0 NaN 3444470.0 NaN NaN 1053.0 400572.0 71.0 False NaN -694862.0 769862.0 rick.buy@enron.com 156.0
CALGER CHRISTOPHER F 240189.0 2598.0 NaN 1639297.0 NaN 1250000.0 126027.0 2188.0 NaN 126027.0 35818.0 NaN 144.0 486.0 25.0 True NaN -262500.0 375304.0 christopher.calger@enron.com 199.0
CARTER REBECCA C 261809.0 312.0 NaN 477557.0 NaN 300000.0 307301.0 196.0 -307301.0 NaN NaN NaN 15.0 540.0 7.0 False NaN -159792.0 75000.0 rebecca.carter@enron.com 29.0
CAUSEY RICHARD A 415189.0 1892.0 NaN 1868758.0 NaN 1000000.0 2502063.0 1585.0 NaN 2502063.0 30674.0 NaN 49.0 307895.0 12.0 True NaN -235000.0 350000.0 richard.causey@enron.com 58.0
CHAN RONNIE NaN NaN NaN NaN NaN NaN 32460.0 NaN -32460.0 NaN NaN NaN NaN NaN NaN False 98784.0 -98784.0 NaN NaN NaN
CHRISTODOULOU DIOMEDES NaN NaN NaN NaN 5127155.0 NaN 950730.0 NaN NaN 6077885.0 NaN NaN NaN NaN NaN False NaN NaN NaN diomedes.christodoulou@enron.com NaN
CLINE KENNETH W NaN NaN NaN NaN NaN NaN 662086.0 NaN -472568.0 189518.0 NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN
COLWELL WESLEY 288542.0 1758.0 27610.0 1490344.0 NaN 1200000.0 698242.0 1132.0 NaN 698242.0 16514.0 NaN 40.0 101740.0 11.0 True NaN -144062.0 NaN wes.colwell@enron.com 240.0
CORDES WILLIAM R NaN 764.0 NaN NaN 651850.0 NaN 386335.0 58.0 NaN 1038185.0 NaN NaN 12.0 NaN 0.0 False NaN NaN NaN bill.cordes@enron.com 10.0
COX DAVID 314288.0 102.0 NaN 1101393.0 117551.0 800000.0 378082.0 71.0 NaN 495633.0 27861.0 NaN 33.0 494.0 4.0 False NaN -41250.0 NaN chip.cox@enron.com 0.0
CUMBERLAND MICHAEL S 184899.0 NaN NaN 807956.0 NaN 325000.0 207940.0 NaN NaN 207940.0 22344.0 NaN NaN 713.0 NaN False NaN NaN 275000.0 NaN NaN
DEFFNER JOSEPH M 206121.0 714.0 NaN 1208649.0 17378.0 600000.0 141833.0 552.0 NaN 159211.0 41626.0 NaN 74.0 25553.0 4.0 False NaN NaN 335349.0 joseph.deffner@enron.com 115.0
DELAINEY DAVID W 365163.0 3093.0 NaN 4747979.0 2291113.0 3000000.0 1323148.0 2097.0 NaN 3614261.0 86174.0 NaN 3069.0 1661.0 609.0 True NaN NaN 1294981.0 david.delainey@enron.com 66.0
DERRICK JR. JAMES V 492375.0 2181.0 NaN 550981.0 8831913.0 800000.0 1787380.0 1401.0 -1787380.0 8831913.0 51124.0 NaN 909.0 7482.0 20.0 False NaN -1284000.0 484000.0 james.derrick@enron.com 64.0
DETMERING TIMOTHY J 210500.0 NaN 875307.0 1204583.0 2027865.0 425000.0 315068.0 NaN -315068.0 2027865.0 52255.0 NaN NaN 1105.0 NaN False NaN -775241.0 415657.0 timothy.detmering@enron.com NaN
DIETRICH JANET R 250100.0 2572.0 NaN 1410464.0 1550019.0 600000.0 315068.0 1902.0 NaN 1865087.0 3475.0 NaN 63.0 473.0 14.0 False NaN NaN 556416.0 janet.dietrich@enron.com 305.0
DIMICHELE RICHARD G 262788.0 NaN NaN 2368151.0 8191755.0 1000000.0 126027.0 NaN NaN 8317782.0 35812.0 NaN NaN 374689.0 NaN False NaN NaN 694862.0 richard.dimichele@enron.com NaN
DODSON KEITH 221003.0 176.0 NaN 319941.0 NaN 70000.0 NaN 114.0 NaN NaN 28164.0 NaN 14.0 774.0 3.0 False NaN NaN NaN keith.dodson@enron.com 10.0
DONAHUE JR JEFFREY M 278601.0 865.0 NaN 875760.0 765920.0 800000.0 315068.0 772.0 NaN 1080988.0 96268.0 NaN 22.0 891.0 11.0 False NaN -300000.0 NaN jeff.donahue@enron.com 188.0
DUNCAN JOHN H NaN NaN NaN 77492.0 371750.0 NaN NaN NaN NaN 371750.0 NaN NaN NaN NaN NaN False 102492.0 -25000.0 NaN NaN NaN
DURAN WILLIAM D 210692.0 904.0 NaN 2093263.0 1451869.0 750000.0 189041.0 599.0 NaN 1640910.0 25785.0 NaN 12.0 1568.0 3.0 False NaN NaN 1105218.0 w.duran@enron.com 106.0
ECHOLS JOHN B 182245.0 NaN NaN 2692324.0 601438.0 200000.0 407503.0 NaN NaN 1008941.0 21530.0 NaN NaN 53775.0 NaN False NaN NaN 2234774.0 john.echols@enron.com NaN
ELLIOTT STEVEN 170941.0 NaN NaN 211725.0 4890344.0 350000.0 1788391.0 NaN NaN 6678735.0 78552.0 NaN NaN 12961.0 NaN False NaN -400729.0 NaN steven.elliott@enron.com NaN
FALLON JAMES B 304588.0 1755.0 NaN 3676340.0 940257.0 2500000.0 1392142.0 1604.0 NaN 2332399.0 95924.0 NaN 75.0 401481.0 37.0 False NaN NaN 374347.0 jim.fallon@enron.com 42.0
FASTOW ANDREW S 440698.0 NaN NaN 2424083.0 NaN 1300000.0 1794412.0 NaN NaN 1794412.0 55921.0 NaN NaN 277464.0 NaN True NaN -1386055.0 1736055.0 andrew.fastow@enron.com NaN
FITZGERALD JAY L 199157.0 936.0 NaN 1414857.0 664461.0 350000.0 956775.0 723.0 NaN 1621236.0 23870.0 NaN 16.0 285414.0 8.0 False NaN NaN 556416.0 jay.fitzgerald@enron.com 1.0
FOWLER PEGGY NaN 517.0 NaN NaN 1324578.0 NaN 560170.0 10.0 NaN 1884748.0 NaN NaN 36.0 NaN 0.0 False NaN NaN NaN kulvinder.fowler@enron.com 0.0
FOY JOE NaN 57.0 181755.0 181755.0 343434.0 NaN NaN 2.0 NaN 343434.0 NaN NaN 13.0 NaN 0.0 False NaN NaN NaN tracy.foy@enron.com 0.0
FREVERT MARK A 1060932.0 3275.0 6426990.0 17252530.0 10433518.0 2000000.0 4188667.0 2979.0 NaN 14622185.0 86987.0 2000000.0 21.0 7427621.0 6.0 False NaN -3367011.0 1617011.0 mark.frevert@enron.com 242.0
FUGH JOHN L NaN NaN 50591.0 50591.0 176378.0 NaN NaN NaN NaN 176378.0 NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN
GAHN ROBERT S 192008.0 NaN 73122.0 900585.0 83237.0 509870.0 235370.0 NaN NaN 318607.0 50080.0 NaN NaN 76547.0 NaN False NaN -1042.0 NaN NaN NaN
GARLAND C KEVIN 231946.0 209.0 NaN 1566469.0 636246.0 850000.0 259907.0 178.0 NaN 896153.0 48405.0 NaN 44.0 60814.0 27.0 False NaN NaN 375304.0 kevin.garland@enron.com 10.0
GATHMANN WILLIAM D NaN NaN NaN NaN 1753766.0 NaN 264013.0 NaN -72419.0 1945360.0 NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN
GIBBS DANA R NaN 169.0 504610.0 966522.0 2218275.0 NaN NaN 23.0 NaN 2218275.0 NaN NaN 12.0 NaN 0.0 False NaN NaN 461912.0 dana.gibbs@enron.com 0.0
GILLIS JOHN NaN NaN NaN NaN 9803.0 NaN 75838.0 NaN NaN 85641.0 NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN
GLISAN JR BEN F 274975.0 873.0 NaN 1272284.0 384728.0 600000.0 393818.0 874.0 NaN 778546.0 125978.0 NaN 16.0 200308.0 6.0 True NaN NaN 71023.0 ben.glisan@enron.com 52.0
GOLD JOSEPH 272880.0 NaN NaN 2146973.0 436515.0 750000.0 441096.0 NaN NaN 877611.0 NaN NaN NaN 819288.0 NaN False NaN NaN 304805.0 joe.gold@enron.com NaN
GRAMM WENDY L NaN NaN NaN 119292.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN False 119292.0 NaN NaN NaN NaN
GRAY RODNEY 6615.0 NaN 93585.0 1146658.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 680833.0 NaN False NaN NaN 365625.0 NaN NaN
HAEDICKE MARK E 374125.0 4009.0 2157527.0 3859065.0 608750.0 1150000.0 524169.0 1847.0 -329825.0 803094.0 76169.0 NaN 1941.0 52382.0 61.0 False NaN -934484.0 983346.0 mark.haedicke@enron.com 180.0
HANNON KEVIN P 243293.0 1045.0 NaN 288682.0 5538001.0 1500000.0 853064.0 1035.0 NaN 6391065.0 34039.0 NaN 32.0 11350.0 21.0 True NaN -3117011.0 1617011.0 kevin.hannon@enron.com 32.0
HAUG DAVID L NaN 573.0 NaN 475.0 NaN NaN 2217299.0 471.0 NaN 2217299.0 475.0 NaN 19.0 NaN 7.0 False NaN NaN NaN david.haug@enron.com 4.0
HAYES ROBERT E NaN 504.0 7961.0 7961.0 NaN NaN 151418.0 50.0 NaN 151418.0 NaN NaN 12.0 NaN 0.0 False NaN NaN NaN robert.hayes@enron.com 16.0
HAYSLETT RODERICK J NaN 2649.0 NaN NaN NaN NaN 346663.0 571.0 NaN 346663.0 NaN NaN 1061.0 NaN 38.0 False NaN NaN NaN rod.hayslett@enron.com 35.0
HERMANN ROBERT J 262663.0 NaN NaN 1297461.0 187500.0 700000.0 480632.0 NaN NaN 668132.0 48357.0 NaN NaN 416441.0 NaN False NaN -280000.0 150000.0 robert.hermann@enron.com NaN
HICKERSON GARY J 211788.0 1320.0 NaN 2081796.0 NaN 1700000.0 441096.0 900.0 NaN 441096.0 98849.0 NaN 27.0 1936.0 1.0 False NaN NaN 69223.0 gary.hickerson@enron.com 40.0
HIRKO JOSEPH NaN NaN 10259.0 91093.0 30766064.0 NaN NaN NaN NaN 30766064.0 77978.0 NaN NaN 2856.0 NaN True NaN NaN NaN joe.hirko@enron.com NaN
HORTON STANLEY C NaN 2350.0 3131860.0 3131860.0 5210569.0 NaN 2046079.0 1074.0 NaN 7256648.0 NaN NaN 1073.0 NaN 15.0 False NaN NaN NaN stanley.horton@enron.com 44.0
HUGHES JAMES A NaN 719.0 NaN NaN 754966.0 NaN 363428.0 589.0 NaN 1118394.0 NaN NaN 34.0 NaN 5.0 False NaN NaN NaN james.hughes@enron.com 35.0
HUMPHREY GENE E 130724.0 128.0 2964506.0 3100224.0 2282768.0 NaN NaN 119.0 NaN 2282768.0 4994.0 NaN 17.0 NaN 17.0 False NaN NaN NaN gene.humphrey@enron.com 10.0
IZZO LAWRENCE L 85274.0 496.0 NaN 1979596.0 2165172.0 NaN 3654808.0 437.0 NaN 5819980.0 28093.0 NaN 19.0 1553729.0 5.0 False NaN NaN 312500.0 larry.izzo@enron.com 28.0
JACKSON CHARLENE R 288558.0 258.0 NaN 551174.0 185063.0 250000.0 540672.0 117.0 NaN 725735.0 10181.0 NaN 56.0 2435.0 19.0 False NaN NaN NaN charlene.jackson@enron.com 25.0
JAEDICKE ROBERT NaN NaN NaN 83750.0 431750.0 NaN 44093.0 NaN -44093.0 431750.0 NaN NaN NaN NaN NaN False 108750.0 -25000.0 NaN NaN NaN
KAMINSKI WINCENTY J 275101.0 4607.0 NaN 1086821.0 850010.0 400000.0 126027.0 583.0 NaN 976037.0 83585.0 NaN 14368.0 4669.0 171.0 False NaN NaN 323466.0 vince.kaminski@enron.com 41.0
KEAN STEVEN J 404338.0 12754.0 NaN 1747522.0 2022048.0 1000000.0 4131594.0 3639.0 NaN 6153642.0 41953.0 NaN 6759.0 1231.0 387.0 False NaN NaN 300000.0 steven.kean@enron.com 140.0
KISHKILL JOSEPH G 174246.0 NaN NaN 704896.0 NaN NaN 1034346.0 NaN NaN 1034346.0 116335.0 NaN NaN 465357.0 NaN False NaN -51042.0 NaN joe.kishkill@enron.com NaN
KITCHEN LOUISE 271442.0 8305.0 NaN 3471141.0 81042.0 3100000.0 466101.0 3669.0 NaN 547143.0 5774.0 NaN 1728.0 93925.0 194.0 False NaN NaN NaN louise.kitchen@enron.com 251.0
KOENIG MARK E 309946.0 2374.0 NaN 1587421.0 671737.0 700000.0 1248318.0 2271.0 NaN 1920055.0 127017.0 NaN 61.0 150458.0 15.0 True NaN NaN 300000.0 mark.koenig@enron.com 53.0
KOPPER MICHAEL J 224305.0 NaN NaN 2652612.0 NaN 800000.0 985032.0 NaN NaN 985032.0 118134.0 NaN NaN 907502.0 NaN True NaN NaN 602671.0 michael.kopper@enron.com NaN
LAVORATO JOHN J 339288.0 7259.0 NaN 10425757.0 4158995.0 8000000.0 1008149.0 3962.0 NaN 5167144.0 49537.0 NaN 2585.0 1552.0 411.0 False NaN NaN 2035380.0 john.lavorato@enron.com 528.0
LAY KENNETH L 1072321.0 4273.0 202911.0 103559793.0 34348384.0 7000000.0 14761694.0 2411.0 NaN 49110078.0 99832.0 81525000.0 36.0 10359729.0 16.0 True NaN -300000.0 3600000.0 kenneth.lay@enron.com 123.0
LEFF DANIEL P 273746.0 2822.0 NaN 2664228.0 NaN 1000000.0 360528.0 2672.0 NaN 360528.0 NaN NaN 63.0 3083.0 14.0 False NaN NaN 1387399.0 dan.leff@enron.com 67.0
LEMAISTRE CHARLES NaN NaN NaN 87492.0 412878.0 NaN NaN NaN NaN 412878.0 NaN NaN NaN NaN NaN False 112492.0 -25000.0 NaN NaN NaN
LEWIS RICHARD NaN 952.0 NaN NaN 850477.0 NaN NaN 739.0 NaN 850477.0 NaN NaN 26.0 NaN 0.0 False NaN NaN NaN richard.lewis@enron.com 10.0
LINDHOLM TOD A 236457.0 NaN 204075.0 875889.0 2549361.0 200000.0 514847.0 NaN NaN 3064208.0 57727.0 NaN NaN 2630.0 NaN False NaN NaN 175000.0 tod.lindholm@enron.com NaN
LOCKHART EUGENE E NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN
LOWRY CHARLES P NaN NaN NaN NaN 372205.0 NaN 153686.0 NaN -153686.0 372205.0 NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN
MARTIN AMANDA K 349487.0 1522.0 85430.0 8407016.0 2070306.0 NaN NaN 477.0 NaN 2070306.0 8211.0 NaN 230.0 2818454.0 0.0 False NaN NaN 5145434.0 a..martin@enron.com 8.0
MCCARTY DANNY J NaN 1433.0 NaN NaN 664375.0 NaN 94556.0 508.0 NaN 758931.0 NaN NaN 215.0 NaN 2.0 False NaN NaN NaN danny.mccarty@enron.com 25.0
MCCLELLAN GEORGE 263413.0 1744.0 NaN 1318763.0 506765.0 900000.0 441096.0 1469.0 NaN 947861.0 228763.0 NaN 49.0 51587.0 0.0 False NaN -125000.0 NaN george.mcclellan@enron.com 52.0
MCCONNELL MICHAEL S 365038.0 3329.0 NaN 2101364.0 1623010.0 1100000.0 1478269.0 2189.0 NaN 3101279.0 81364.0 NaN 2742.0 540.0 194.0 False NaN NaN 554422.0 mike.mcconnell@enron.com 92.0
MCDONALD REBECCA NaN 894.0 NaN NaN 757301.0 NaN 934065.0 720.0 NaN 1691366.0 NaN NaN 13.0 NaN 1.0 False NaN NaN NaN rebecca.mcdonald@enron.com 54.0
MCMAHON JEFFREY 370448.0 2355.0 NaN 4099771.0 1104054.0 2600000.0 558801.0 2228.0 NaN 1662855.0 137108.0 NaN 48.0 297353.0 26.0 False NaN NaN 694862.0 jeffrey.mcmahon@enron.com 58.0
MENDELSOHN JOHN NaN NaN NaN 148.0 NaN NaN NaN NaN NaN NaN 148.0 NaN NaN NaN NaN False 103750.0 -103750.0 NaN NaN NaN
METTS MARK 365788.0 807.0 NaN 1061827.0 NaN 600000.0 585062.0 702.0 NaN 585062.0 94299.0 NaN 29.0 1740.0 1.0 False NaN NaN NaN mark.metts@enron.com 38.0
MEYER JEROME J NaN NaN NaN 2151.0 NaN NaN NaN NaN NaN NaN 2151.0 NaN NaN NaN NaN False 38346.0 -38346.0 NaN NaN NaN
MEYER ROCKFORD G NaN 232.0 1848227.0 1848227.0 493489.0 NaN 462384.0 22.0 NaN 955873.0 NaN NaN 28.0 NaN 0.0 False NaN NaN NaN rockford.meyer@enron.com 0.0
MORAN MICHAEL P NaN 672.0 NaN NaN 59539.0 NaN 161602.0 127.0 NaN 221141.0 NaN NaN 19.0 NaN 0.0 False NaN NaN NaN michael.moran@enron.com 0.0
MORDAUNT KRISTINA M 267093.0 NaN NaN 628522.0 NaN 325000.0 208510.0 NaN NaN 208510.0 35018.0 NaN NaN 1411.0 NaN False NaN NaN NaN kristina.mordaunt@enron.com NaN
MULLER MARK S 251654.0 136.0 842924.0 3202070.0 1056320.0 1100000.0 360528.0 114.0 NaN 1416848.0 NaN NaN 16.0 947.0 0.0 False NaN -719000.0 1725545.0 s..muller@enron.com 12.0
MURRAY JULIA H 229284.0 2192.0 NaN 812194.0 400478.0 400000.0 196983.0 395.0 NaN 597461.0 57580.0 NaN 45.0 330.0 2.0 False NaN NaN 125000.0 julia.murray@enron.com 11.0
NOLES JAMES L NaN NaN 774401.0 774401.0 NaN NaN 463261.0 NaN -94556.0 368705.0 NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN
OLSON CINDY K 329078.0 1184.0 77716.0 1321557.0 1637034.0 750000.0 969729.0 856.0 NaN 2606763.0 63791.0 NaN 52.0 972.0 15.0 False NaN NaN 100000.0 cindy.olson@enron.com 20.0
OVERDYKE JR JERE C 94941.0 NaN NaN 249787.0 5266578.0 NaN 2041016.0 NaN NaN 7307594.0 18834.0 NaN NaN 176.0 NaN False NaN NaN 135836.0 jere.overdyke@enron.com NaN
PAI LOU L 261879.0 NaN NaN 3123383.0 15364167.0 1000000.0 8453763.0 NaN NaN 23817930.0 32047.0 NaN NaN 1829457.0 NaN False NaN NaN NaN lou.pai@enron.com NaN
PEREIRA PAULO V. FERRAZ NaN NaN NaN 27942.0 NaN NaN NaN NaN NaN NaN 27942.0 NaN NaN NaN NaN False 101250.0 -101250.0 NaN NaN NaN
PICKERING MARK R 655037.0 898.0 NaN 1386690.0 28798.0 300000.0 NaN 728.0 NaN 28798.0 31653.0 400000.0 67.0 NaN 0.0 False NaN NaN NaN mark.pickering@enron.com 7.0
PIPER GREGORY F 197091.0 1238.0 1130036.0 1737629.0 880290.0 400000.0 409554.0 742.0 -409554.0 880290.0 43057.0 NaN 222.0 778.0 48.0 False NaN -33333.0 NaN greg.piper@enron.com 61.0
PIRO JIM NaN 58.0 NaN NaN NaN NaN 47304.0 3.0 NaN 47304.0 NaN NaN 16.0 NaN 1.0 False NaN NaN NaN jim.piro@enron.com 0.0
POWERS WILLIAM NaN 653.0 NaN NaN NaN NaN NaN 12.0 NaN NaN NaN NaN 26.0 NaN 0.0 False 17500.0 -17500.0 NaN ken.powers@enron.com 0.0
PRENTICE JAMES NaN NaN 564348.0 564348.0 886231.0 NaN 208809.0 NaN NaN 1095040.0 NaN NaN NaN NaN NaN False NaN NaN NaN james.prentice@enron.com NaN
REDMOND BRIAN L 96840.0 1671.0 NaN 111529.0 7509039.0 NaN 381285.0 1063.0 NaN 7890324.0 14689.0 NaN 221.0 NaN 49.0 False NaN NaN NaN brian.redmond@enron.com 204.0
REYNOLDS LAWRENCE 76399.0 NaN 51365.0 394475.0 4160672.0 100000.0 201483.0 NaN -140264.0 4221891.0 8409.0 NaN NaN 202052.0 NaN False NaN -200000.0 156250.0 NaN NaN
RICE KENNETH D 420636.0 905.0 NaN 505050.0 19794175.0 1750000.0 2748364.0 864.0 NaN 22542539.0 46950.0 NaN 18.0 174839.0 4.0 True NaN -3504386.0 1617011.0 ken.rice@enron.com 42.0
RIEKER PAULA H 249201.0 1328.0 214678.0 1099100.0 1635238.0 700000.0 283649.0 1258.0 NaN 1918887.0 33271.0 NaN 82.0 1950.0 48.0 True NaN -100000.0 NaN paula.rieker@enron.com 35.0
SAVAGE FRANK NaN NaN NaN 3750.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN False 125034.0 -121284.0 NaN NaN NaN
SCRIMSHAW MATTHEW NaN NaN NaN NaN 759557.0 NaN NaN NaN NaN 759557.0 NaN NaN NaN NaN NaN False NaN NaN NaN matthew.scrimshaw@enron.com NaN
SHANKMAN JEFFREY A 304110.0 3221.0 NaN 3038702.0 1441898.0 2000000.0 630137.0 1730.0 NaN 2072035.0 178979.0 NaN 2681.0 1191.0 83.0 False NaN NaN 554422.0 jeffrey.shankman@enron.com 94.0
SHAPIRO RICHARD S 269076.0 15149.0 NaN 1057548.0 607837.0 650000.0 379164.0 4527.0 NaN 987001.0 137767.0 NaN 1215.0 705.0 65.0 False NaN NaN NaN richard.shapiro@enron.com 74.0
SHARP VICTORIA T 248146.0 3136.0 187469.0 1576511.0 281073.0 600000.0 213063.0 2477.0 NaN 494136.0 116337.0 NaN 136.0 2401.0 6.0 False NaN NaN 422158.0 vicki.sharp@enron.com 24.0
SHELBY REX 211844.0 225.0 NaN 2003885.0 1624396.0 200000.0 869220.0 91.0 NaN 2493616.0 22884.0 NaN 39.0 1573324.0 14.0 True NaN -4167.0 NaN rex.shelby@enron.com 13.0
SHERRICK JEFFREY B NaN 613.0 NaN NaN 1426469.0 NaN 405999.0 583.0 NaN 1832468.0 NaN NaN 25.0 NaN 18.0 False NaN NaN NaN jeffrey.sherrick@enron.com 39.0
SHERRIFF JOHN R 428780.0 3187.0 NaN 4335388.0 1835558.0 1500000.0 1293424.0 2103.0 NaN 3128982.0 NaN NaN 92.0 1852186.0 23.0 False NaN NaN 554422.0 john.sherriff@enron.com 28.0
SKILLING JEFFREY K 1111258.0 3627.0 NaN 8682716.0 19250000.0 5600000.0 6843672.0 2042.0 NaN 26093672.0 29336.0 NaN 108.0 22122.0 30.0 True NaN NaN 1920000.0 jeff.skilling@enron.com 88.0
STABLER FRANK 239502.0 NaN NaN 1112087.0 NaN 500000.0 511734.0 NaN NaN 511734.0 16514.0 NaN NaN 356071.0 NaN False NaN NaN NaN frank.stabler@enron.com NaN
SULLIVAN-SHAKLOVITZ COLLEEN 162779.0 NaN 181993.0 999356.0 1362375.0 100000.0 NaN NaN NaN 1362375.0 NaN NaN NaN 162.0 NaN False NaN NaN 554422.0 NaN NaN
SUNDE MARTIN 257486.0 2647.0 NaN 1545059.0 NaN 700000.0 698920.0 2565.0 NaN 698920.0 NaN NaN 38.0 111122.0 13.0 False NaN NaN 476451.0 marty.sunde@enron.com 37.0
TAYLOR MITCHELL S 265214.0 533.0 227449.0 1092663.0 3181250.0 600000.0 563798.0 300.0 NaN 3745048.0 NaN NaN 29.0 NaN 0.0 False NaN NaN NaN mitchell.taylor@enron.com 0.0
THE TRAVEL AGENCY IN THE PARK NaN NaN NaN 362096.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 362096.0 NaN False NaN NaN NaN NaN NaN
THORN TERENCE H 222093.0 266.0 16586.0 911453.0 4452476.0 NaN 365320.0 73.0 NaN 4817796.0 46145.0 NaN 41.0 426629.0 0.0 False NaN NaN 200000.0 terence.thorn@enron.com 0.0
TILNEY ELIZABETH A 247338.0 460.0 NaN 399393.0 591250.0 300000.0 576792.0 379.0 NaN 1168042.0 NaN NaN 19.0 152055.0 11.0 False NaN -575000.0 275000.0 elizabeth.tilney@enron.com 10.0
TOTAL 26704229.0 NaN 32083396.0 309886585.0 311764000.0 97343619.0 130322299.0 NaN -7576788.0 434509511.0 5235198.0 83925000.0 NaN 42667589.0 NaN False 1398517.0 -27992891.0 48521928.0 NaN NaN
UMANOFF ADAM S 288589.0 111.0 NaN 1130461.0 NaN 788750.0 NaN 41.0 NaN NaN 53122.0 NaN 18.0 NaN 0.0 False NaN NaN NaN adam.umanoff@enron.com 12.0
URQUHART JOHN A NaN NaN NaN 228656.0 NaN NaN NaN NaN NaN NaN 228656.0 NaN NaN NaN NaN False 36666.0 -36666.0 NaN NaN NaN
WAKEHAM JOHN NaN NaN NaN 213071.0 NaN NaN NaN NaN NaN NaN 103773.0 NaN NaN NaN NaN False 109298.0 NaN NaN NaN NaN
WALLS JR ROBERT H 357091.0 671.0 NaN 1798780.0 4346544.0 850000.0 1552453.0 215.0 NaN 5898997.0 50936.0 NaN 146.0 2.0 0.0 False NaN NaN 540751.0 rob.walls@enron.com 17.0
WALTERS GARETH W NaN NaN 53625.0 87410.0 1030329.0 NaN NaN NaN NaN 1030329.0 33785.0 NaN NaN NaN NaN False NaN NaN NaN NaN NaN
WASAFF GEORGE 259996.0 400.0 831299.0 1034395.0 1668260.0 325000.0 388167.0 337.0 NaN 2056427.0 NaN NaN 30.0 1425.0 7.0 False NaN -583325.0 200000.0 george.wasaff@enron.com 22.0
WESTFAHL RICHARD K 63744.0 NaN NaN 762135.0 NaN NaN 384930.0 NaN NaN 384930.0 51870.0 NaN NaN 401130.0 NaN False NaN -10800.0 256191.0 dick.westfahl@enron.com NaN
WHALEY DAVID A NaN NaN NaN NaN 98718.0 NaN NaN NaN NaN 98718.0 NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN
WHALLEY LAWRENCE G 510364.0 6019.0 NaN 4677574.0 3282960.0 3000000.0 2796177.0 3920.0 NaN 6079137.0 57838.0 NaN 556.0 301026.0 24.0 False NaN NaN 808346.0 greg.whalley@enron.com 186.0
WHITE JR THOMAS E 317543.0 NaN NaN 1934359.0 1297049.0 450000.0 13847074.0 NaN NaN 15144123.0 81353.0 NaN NaN 1085463.0 NaN False NaN NaN NaN thomas.white@enron.com NaN
WINOKUR JR. HERBERT S NaN NaN NaN 84992.0 NaN NaN NaN NaN NaN NaN 1413.0 NaN NaN NaN NaN False 108579.0 -25000.0 NaN NaN NaN
WODRASKA JOHN NaN NaN NaN 189583.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 189583.0 NaN False NaN NaN NaN john.wodraska@enron.com NaN
WROBEL BRUCE NaN NaN NaN NaN 139130.0 NaN NaN NaN NaN 139130.0 NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN
YEAGER F SCOTT 158403.0 NaN NaN 360300.0 8308552.0 NaN 3576206.0 NaN NaN 11884758.0 53947.0 NaN NaN 147950.0 NaN True NaN NaN NaN scott.yeager@enron.com NaN
YEAP SOON NaN NaN NaN 55097.0 192758.0 NaN NaN NaN NaN 192758.0 55097.0 NaN NaN NaN NaN False NaN NaN NaN NaN NaN

There is a row with the name, "TOTAL". This row should be removed.


In [7]:
# Omit the TOTAL index
enron_df.drop('TOTAL', inplace=True)

Financial Features


In [8]:
enron_df.loc[:, ['salary',
                 'deferral_payments',
                 'total_payments',
                 'loan_advances',
                 'bonus',
                 'restricted_stock_deferred',
                 'deferred_income',]].describe()


Out[8]:
salary deferral_payments total_payments loan_advances bonus restricted_stock_deferred deferred_income
count 9.400000e+01 3.800000e+01 1.240000e+02 3.000000e+00 8.100000e+01 1.700000e+01 4.800000e+01
mean 2.840875e+05 8.416025e+05 2.623421e+06 2.797500e+07 1.201773e+06 6.218928e+05 -5.810498e+05
std 1.771311e+05 1.289323e+06 9.488106e+06 4.638256e+07 1.441679e+06 3.845528e+06 9.420764e+05
min 4.770000e+02 -1.025000e+05 1.480000e+02 4.000000e+05 7.000000e+04 -1.787380e+06 -3.504386e+06
25% 2.118020e+05 7.964450e+04 3.863802e+05 1.200000e+06 4.250000e+05 -3.298250e+05 -6.112092e+05
50% 2.587410e+05 2.210635e+05 1.100246e+06 2.000000e+06 7.500000e+05 -1.402640e+05 -1.519270e+05
75% 3.086065e+05 8.672112e+05 2.084663e+06 4.176250e+07 1.200000e+06 -7.241900e+04 -3.792600e+04
max 1.111258e+06 6.426990e+06 1.035598e+08 8.152500e+07 8.000000e+06 1.545629e+07 -8.330000e+02

In [9]:
enron_df.loc[:, ['total_stock_value',
                 'expenses',
                 'exercised_stock_options',
                 'other',
                 'long_term_incentive',
                 'restricted_stock',
                 'director_fees']].describe()


Out[9]:
total_stock_value expenses exercised_stock_options other long_term_incentive restricted_stock director_fees
count 1.250000e+02 94.000000 1.010000e+02 9.200000e+01 6.500000e+01 1.090000e+02 16.000000
mean 3.352073e+06 54192.010638 2.959559e+06 4.652767e+05 7.464912e+05 1.147424e+06 89822.875000
std 6.532883e+06 46108.377454 5.499450e+06 1.389719e+06 8.629174e+05 2.249770e+06 41112.700735
min -4.409300e+04 148.000000 3.285000e+03 2.000000e+00 6.922300e+04 -2.604490e+06 3285.000000
25% 4.941360e+05 22479.000000 5.067650e+05 1.209000e+03 2.750000e+05 2.520550e+05 83674.500000
50% 1.095040e+06 46547.500000 1.297049e+06 5.198450e+04 4.221580e+05 4.410960e+05 106164.500000
75% 2.606763e+06 78408.500000 2.542813e+06 3.575772e+05 8.318090e+05 9.850320e+05 112815.000000
max 4.911008e+07 228763.000000 3.434838e+07 1.035973e+07 5.145434e+06 1.476169e+07 137864.000000

Email Features


In [10]:
enron_df.loc[:, ['to_messages',
                 'email_address',
                 'from_poi_to_this_person',
                 'from_messages',
                 'from_this_person_to_poi',
                 'shared_receipt_with_poi']].describe()


Out[10]:
to_messages from_poi_to_this_person from_messages from_this_person_to_poi shared_receipt_with_poi
count 86.000000 86.000000 86.000000 86.000000 86.000000
mean 2073.860465 64.895349 608.790698 41.232558 1176.465116
std 2582.700981 86.979244 1841.033949 100.073111 1178.317641
min 57.000000 0.000000 12.000000 0.000000 2.000000
25% 541.250000 10.000000 22.750000 1.000000 249.750000
50% 1211.000000 35.000000 41.000000 8.000000 740.500000
75% 2634.750000 72.250000 145.500000 24.750000 1888.250000
max 15149.000000 528.000000 14368.000000 609.000000 5521.000000

Persons of Interest


In [11]:
enron_poi = enron_df[enron_df['poi']==True]
print("Number of POI's: " + str(len(enron_poi)))
enron_poi


Number of POI's: 18
Out[11]:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value expenses loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive email_address from_poi_to_this_person
BELDEN TIMOTHY N 213999.0 7991.0 2144013.0 5501630.0 953136.0 5249999.0 157569.0 5521.0 NaN 1110705.0 17355.0 NaN 484.0 210698.0 108.0 True NaN -2334434.0 NaN tim.belden@enron.com 228.0
BOWEN JR RAYMOND M 278601.0 1858.0 NaN 2669589.0 NaN 1350000.0 252055.0 1593.0 NaN 252055.0 65907.0 NaN 27.0 1621.0 15.0 True NaN -833.0 974293.0 raymond.bowen@enron.com 140.0
CALGER CHRISTOPHER F 240189.0 2598.0 NaN 1639297.0 NaN 1250000.0 126027.0 2188.0 NaN 126027.0 35818.0 NaN 144.0 486.0 25.0 True NaN -262500.0 375304.0 christopher.calger@enron.com 199.0
CAUSEY RICHARD A 415189.0 1892.0 NaN 1868758.0 NaN 1000000.0 2502063.0 1585.0 NaN 2502063.0 30674.0 NaN 49.0 307895.0 12.0 True NaN -235000.0 350000.0 richard.causey@enron.com 58.0
COLWELL WESLEY 288542.0 1758.0 27610.0 1490344.0 NaN 1200000.0 698242.0 1132.0 NaN 698242.0 16514.0 NaN 40.0 101740.0 11.0 True NaN -144062.0 NaN wes.colwell@enron.com 240.0
DELAINEY DAVID W 365163.0 3093.0 NaN 4747979.0 2291113.0 3000000.0 1323148.0 2097.0 NaN 3614261.0 86174.0 NaN 3069.0 1661.0 609.0 True NaN NaN 1294981.0 david.delainey@enron.com 66.0
FASTOW ANDREW S 440698.0 NaN NaN 2424083.0 NaN 1300000.0 1794412.0 NaN NaN 1794412.0 55921.0 NaN NaN 277464.0 NaN True NaN -1386055.0 1736055.0 andrew.fastow@enron.com NaN
GLISAN JR BEN F 274975.0 873.0 NaN 1272284.0 384728.0 600000.0 393818.0 874.0 NaN 778546.0 125978.0 NaN 16.0 200308.0 6.0 True NaN NaN 71023.0 ben.glisan@enron.com 52.0
HANNON KEVIN P 243293.0 1045.0 NaN 288682.0 5538001.0 1500000.0 853064.0 1035.0 NaN 6391065.0 34039.0 NaN 32.0 11350.0 21.0 True NaN -3117011.0 1617011.0 kevin.hannon@enron.com 32.0
HIRKO JOSEPH NaN NaN 10259.0 91093.0 30766064.0 NaN NaN NaN NaN 30766064.0 77978.0 NaN NaN 2856.0 NaN True NaN NaN NaN joe.hirko@enron.com NaN
KOENIG MARK E 309946.0 2374.0 NaN 1587421.0 671737.0 700000.0 1248318.0 2271.0 NaN 1920055.0 127017.0 NaN 61.0 150458.0 15.0 True NaN NaN 300000.0 mark.koenig@enron.com 53.0
KOPPER MICHAEL J 224305.0 NaN NaN 2652612.0 NaN 800000.0 985032.0 NaN NaN 985032.0 118134.0 NaN NaN 907502.0 NaN True NaN NaN 602671.0 michael.kopper@enron.com NaN
LAY KENNETH L 1072321.0 4273.0 202911.0 103559793.0 34348384.0 7000000.0 14761694.0 2411.0 NaN 49110078.0 99832.0 81525000.0 36.0 10359729.0 16.0 True NaN -300000.0 3600000.0 kenneth.lay@enron.com 123.0
RICE KENNETH D 420636.0 905.0 NaN 505050.0 19794175.0 1750000.0 2748364.0 864.0 NaN 22542539.0 46950.0 NaN 18.0 174839.0 4.0 True NaN -3504386.0 1617011.0 ken.rice@enron.com 42.0
RIEKER PAULA H 249201.0 1328.0 214678.0 1099100.0 1635238.0 700000.0 283649.0 1258.0 NaN 1918887.0 33271.0 NaN 82.0 1950.0 48.0 True NaN -100000.0 NaN paula.rieker@enron.com 35.0
SHELBY REX 211844.0 225.0 NaN 2003885.0 1624396.0 200000.0 869220.0 91.0 NaN 2493616.0 22884.0 NaN 39.0 1573324.0 14.0 True NaN -4167.0 NaN rex.shelby@enron.com 13.0
SKILLING JEFFREY K 1111258.0 3627.0 NaN 8682716.0 19250000.0 5600000.0 6843672.0 2042.0 NaN 26093672.0 29336.0 NaN 108.0 22122.0 30.0 True NaN NaN 1920000.0 jeff.skilling@enron.com 88.0
YEAGER F SCOTT 158403.0 NaN NaN 360300.0 8308552.0 NaN 3576206.0 NaN NaN 11884758.0 53947.0 NaN NaN 147950.0 NaN True NaN NaN NaN scott.yeager@enron.com NaN

Number of NaN's:


In [12]:
enron_df.isnull().sum()


Out[12]:
salary                        51
to_messages                   59
deferral_payments            107
total_payments                21
exercised_stock_options       44
bonus                         64
restricted_stock              36
shared_receipt_with_poi       59
restricted_stock_deferred    128
total_stock_value             20
expenses                      51
loan_advances                142
from_messages                 59
other                         53
from_this_person_to_poi       59
poi                            0
director_fees                129
deferred_income               97
long_term_incentive           80
email_address                 34
from_poi_to_this_person       59
dtype: int64

In [13]:
sum(enron_df.isnull().sum())


Out[13]:
1352

Data Exploration Findings

  • 145 people
  • 20 features
  • 18 persons of interest
  • 1352 NaN entries
    • director_fees, loan_advances, and restricted_stock_deferred are the top 3 columns with the most NaN entries

Since there were so many 'NaN' entries, I went back to the pdf that the data was derived from and noticed that entries with '-' were being interpreted as NaN. To fix this I will be replacing the 'NaN's with the value zero.


In [14]:
enron_df.fillna(0, inplace=True)

Outlier Investigation

One outlier that was removed earlier was the "Total" index, which was represented the total sums of each columns. There were two entries that were invalid data points. One was "THE TRAVEL AGENCY IN THE PARK", which can not be a person and definitely not a person of interest. The other invalid entry was "LOCKHART EUGENE E", which had NaN values for all the features.


In [15]:
# Drop email_address column
enron_df.drop('email_address', axis=1, inplace=True)

enron_df.drop("THE TRAVEL AGENCY IN THE PARK", inplace=True)
enron_df.drop("LOCKHART EUGENE E", inplace=True)

Indexes Removed

  • TOTAL
  • THE TRAVEL AGENCY IN THE PARK
  • LOCKHART EUGENE E

The 'email_address' column was also removed since it is irrelevant data.

Optimize Feature Selection/Engineering

Create new features

Do POI's receive more emails from other POI's compared to non POI's?


In [16]:
enron_df['from_poi_ratio'] = enron_df['from_poi_to_this_person'] / enron_df['from_messages']
enron_df.fillna(0, inplace=True)

Do POI's write more emails to other POI's compared to non POI's?


In [17]:
enron_df['to_poi_ratio'] = enron_df['from_this_person_to_poi'] / enron_df['to_messages']
enron_df.fillna(0, inplace=True)

Do POI's have a bigger bonus to salary ratio?


In [18]:
enron_df['bonus_ratio'] = enron_df['bonus'] / enron_df['salary']

In [19]:
enron_df[['poi','bonus_ratio']]


Out[19]:
poi bonus_ratio
ALLEN PHILLIP K False 20.672922
BADUM JAMES P False NaN
BANNANTINE JAMES M False 0.000000
BAXTER JOHN C False 4.492666
BAY FRANKLIN R False 1.668955
BAZELIDES PHILIP J False 0.000000
BECK SALLY W False 3.025980
BELDEN TIMOTHY N True 24.532820
BELFER ROBERT False NaN
BERBERIAN DAVID False 0.000000
BERGSIEKER RICHARD P False 1.330339
BHATNAGAR SANJAY False NaN
BIBI PHILIPPE A False 4.681100
BLACHMAN JEREMY M False 3.419890
BLAKE JR. NORMAN P False NaN
BOWEN JR RAYMOND M True 4.845639
BROWN MICHAEL False NaN
BUCHANAN HAROLD G False 2.015991
BUTTS ROBERT H False 2.867893
BUY RICHARD B False 2.722768
CALGER CHRISTOPHER F True 5.204235
CARTER REBECCA C False 1.145874
CAUSEY RICHARD A True 2.408542
CHAN RONNIE False NaN
CHRISTODOULOU DIOMEDES False NaN
CLINE KENNETH W False NaN
COLWELL WESLEY True 4.158840
CORDES WILLIAM R False NaN
COX DAVID False 2.545436
CUMBERLAND MICHAEL S False 1.757716
DEFFNER JOSEPH M False 2.910912
DELAINEY DAVID W True 8.215509
DERRICK JR. JAMES V False 1.624778
DETMERING TIMOTHY J False 2.019002
DIETRICH JANET R False 2.399040
DIMICHELE RICHARD G False 3.805349
DODSON KEITH False 0.316738
DONAHUE JR JEFFREY M False 2.871490
DUNCAN JOHN H False NaN
DURAN WILLIAM D False 3.559699
ECHOLS JOHN B False 1.097424
ELLIOTT STEVEN False 2.047490
FALLON JAMES B False 8.207809
FASTOW ANDREW S True 2.949866
FITZGERALD JAY L False 1.757407
FOWLER PEGGY False NaN
FOY JOE False NaN
FREVERT MARK A False 1.885135
FUGH JOHN L False NaN
GAHN ROBERT S False 2.655462
GARLAND C KEVIN False 3.664646
GATHMANN WILLIAM D False NaN
GIBBS DANA R False NaN
GILLIS JOHN False NaN
GLISAN JR BEN F True 2.182017
GOLD JOSEPH False 2.748461
GRAMM WENDY L False NaN
GRAY RODNEY False 0.000000
HAEDICKE MARK E False 3.073839
HANNON KEVIN P True 6.165405
HAUG DAVID L False NaN
HAYES ROBERT E False NaN
HAYSLETT RODERICK J False NaN
HERMANN ROBERT J False 2.665012
HICKERSON GARY J False 8.026895
HIRKO JOSEPH True NaN
HORTON STANLEY C False NaN
HUGHES JAMES A False NaN
HUMPHREY GENE E False 0.000000
IZZO LAWRENCE L False 0.000000
JACKSON CHARLENE R False 0.866377
JAEDICKE ROBERT False NaN
KAMINSKI WINCENTY J False 1.454011
KEAN STEVEN J False 2.473178
KISHKILL JOSEPH G False 0.000000
KITCHEN LOUISE False 11.420488
KOENIG MARK E True 2.258458
KOPPER MICHAEL J True 3.566572
LAVORATO JOHN J False 23.578789
LAY KENNETH L True 6.527896
LEFF DANIEL P False 3.653021
LEMAISTRE CHARLES False NaN
LEWIS RICHARD False NaN
LINDHOLM TOD A False 0.845820
LOWRY CHARLES P False NaN
MARTIN AMANDA K False 0.000000
MCCARTY DANNY J False NaN
MCCLELLAN GEORGE False 3.416688
MCCONNELL MICHAEL S False 3.013385
MCDONALD REBECCA False NaN
MCMAHON JEFFREY False 7.018529
MENDELSOHN JOHN False NaN
METTS MARK False 1.640294
MEYER JEROME J False NaN
MEYER ROCKFORD G False NaN
MORAN MICHAEL P False NaN
MORDAUNT KRISTINA M False 1.216805
MULLER MARK S False 4.371081
MURRAY JULIA H False 1.744561
NOLES JAMES L False NaN
OLSON CINDY K False 2.279095
OVERDYKE JR JERE C False 0.000000
PAI LOU L False 3.818557
PEREIRA PAULO V. FERRAZ False NaN
PICKERING MARK R False 0.457989
PIPER GREGORY F False 2.029519
PIRO JIM False NaN
POWERS WILLIAM False NaN
PRENTICE JAMES False NaN
REDMOND BRIAN L False 0.000000
REYNOLDS LAWRENCE False 1.308918
RICE KENNETH D True 4.160367
RIEKER PAULA H True 2.808977
SAVAGE FRANK False NaN
SCRIMSHAW MATTHEW False NaN
SHANKMAN JEFFREY A False 6.576568
SHAPIRO RICHARD S False 2.415674
SHARP VICTORIA T False 2.417931
SHELBY REX True 0.944091
SHERRICK JEFFREY B False NaN
SHERRIFF JOHN R False 3.498297
SKILLING JEFFREY K True 5.039334
STABLER FRANK False 2.087665
SULLIVAN-SHAKLOVITZ COLLEEN False 0.614330
SUNDE MARTIN False 2.718594
TAYLOR MITCHELL S False 2.262324
THORN TERENCE H False 0.000000
TILNEY ELIZABETH A False 1.212915
UMANOFF ADAM S False 2.733126
URQUHART JOHN A False NaN
WAKEHAM JOHN False NaN
WALLS JR ROBERT H False 2.380346
WALTERS GARETH W False NaN
WASAFF GEORGE False 1.250019
WESTFAHL RICHARD K False 0.000000
WHALEY DAVID A False NaN
WHALLEY LAWRENCE G False 5.878158
WHITE JR THOMAS E False 1.417131
WINOKUR JR. HERBERT S False NaN
WODRASKA JOHN False NaN
WROBEL BRUCE False NaN
YEAGER F SCOTT True 0.000000
YEAP SOON False NaN

For NaN values, the labels are more POI than not so these values will be filled with 0, since there seems to be a weak correlation between POI's and a large bonus_ratio.


In [20]:
enron_df.fillna(0, inplace=True)

Feature Engineering Conclusion:

  • Three features were created:
    • The ratio of (emails from poi's sent to the person) to (from received)
    • The ratio of (emails from this person sent to poi's) to (emails sent)
    • The ratio of bonus to salary
  • In the next step feature selection will be done with selectKBest, PCA, and inherently with the feature importance in various tree classifier algorithms.

Classifier Pipelines


In [21]:
# Separate labels and features
enron_df_labels = enron_df['poi']
enron_df_features = enron_df[enron_df.columns.difference(['poi'])]

Baseline Classifier: Gaussian Naive Bayes


In [22]:
pipeline = Pipeline([
        ('kbest', SelectKBest()),
        ('gnb', GaussianNB())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {"kbest__k": [1, 2, 3, 5, 8, 13, 19], "kbest__score_func": [f_classif]}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)


/Users/wtruong/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/wtruong/anaconda/lib/python2.7/site-packages/sklearn/feature_selection/univariate_selection.py:113: UserWarning: Features [11] are constant.
  UserWarning)
Out[22]:
GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True False], n_iter=100, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('kbest', SelectKBest(k=10, score_func=<function f_classif at 0x114f330c8>)), ('gnb', GaussianNB())]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kbest__k': [1, 2, 3, 5, 8, 13, 19], 'kbest__score_func': [<function f_classif at 0x114f330c8>]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [23]:
kbest = clf.best_estimator_.steps[0][1]
kbest.get_support()
features = sorted(zip(enron_df_features.columns, kbest.scores_, kbest.get_support()), key=lambda x: x[1])

my_list = [x[0] for x in features if x[2] == True]
my_list = ['poi'] + my_list
my_list


Out[23]:
['poi',
 'deferred_income',
 'salary',
 'bonus',
 'total_stock_value',
 'exercised_stock_options']

In [24]:
data = enron_df[my_list].transpose().to_dict()

In [25]:
dump_classifier_and_data(GaussianNB(), data, my_list)

In [26]:
tester.main()


GaussianNB()
	Accuracy: 0.85464	Precision: 0.48876	Recall: 0.38050	F1: 0.42789	F2: 0.39814
	Total predictions: 14000	True positives:  761	False positives:  796	False negatives: 1239	True negatives: 11204

Using PCA instead of selectKBest:


In [27]:
pipeline = Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ('pca', PCA()),
        ('gnb', GaussianNB())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {
    "pca__n_components": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19],
}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)


Out[27]:
GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True False], n_iter=100, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('gnb', GaussianNB())]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [28]:
pca = clf.best_estimator_.steps[1][1]

In [29]:
pca = clf.best_estimator_.steps[1][1]
pca.n_components


Out[29]:
12

In [30]:
pca_nb = Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ('pca', PCA(n_components=pca.n_components)),
        ('gnb', GaussianNB())])

In [31]:
features_list = list(enron_df.columns)
features_list.remove('poi')
features_list = ['poi'] + features_list

In [32]:
dump_classifier_and_data(pca_nb, enron_df.transpose().to_dict(), features_list)

In [33]:
tester.main()


Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=12, whiten=False)), ('gnb', GaussianNB())])
	Accuracy: 0.79773	Precision: 0.28075	Recall: 0.33100	F1: 0.30381	F2: 0.31956
	Total predictions: 15000	True positives:  662	False positives: 1696	False negatives: 1338	True negatives: 11304

PCA in our case performs poorly when compared to selectKBest. This indicates that variance is needed in the dataset.

Next Classifier: Decision Tree

With SelectKBest Feature Selection


In [34]:
pipeline = Pipeline([
        ('kbest', SelectKBest()),
        ('dt', DecisionTreeClassifier())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {"kbest__k": [1, 2, 3, 5, 8, 13, 19], 'dt__max_features': [None, 'auto', 'log2'],
              'dt__criterion': ['gini', 'entropy']}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)


Out[34]:
GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True False], n_iter=100, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('kbest', SelectKBest(k=10, score_func=<function f_classif at 0x114f330c8>)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kbest__k': [1, 2, 3, 5, 8, 13, 19], 'dt__criterion': ['gini', 'entropy'], 'dt__max_features': [None, 'auto', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [35]:
kbest = clf.best_estimator_.steps[0][1]
kbest.get_support()
features = sorted(zip(enron_df_features.columns, kbest.scores_, kbest.get_support()), key=lambda x: x[1])

my_list = [x[0] for x in features if x[2] == True]
my_list = ['poi'] + my_list
my_list


Out[35]:
['poi',
 'to_messages',
 'director_fees',
 'from_this_person_to_poi',
 'to_poi_ratio',
 'other',
 'from_poi_ratio',
 'from_poi_to_this_person',
 'expenses',
 'loan_advances',
 'shared_receipt_with_poi',
 'total_payments',
 'restricted_stock',
 'long_term_incentive',
 'bonus_ratio',
 'deferred_income',
 'salary',
 'bonus',
 'total_stock_value',
 'exercised_stock_options']

In [36]:
clf.best_estimator_.steps[1][1]


Out[36]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='log2', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [37]:
data = enron_df[my_list].transpose().to_dict()
dump_classifier_and_data(clf.best_estimator_.steps[1][1], data, my_list)

In [38]:
tester.main()


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='log2', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.81080	Precision: 0.29739	Recall: 0.30750	F1: 0.30236	F2: 0.30542
	Total predictions: 15000	True positives:  615	False positives: 1453	False negatives: 1385	True negatives: 11547

Without SelectKBest Feature Selection


In [39]:
pipeline = Pipeline([
        ('dt', DecisionTreeClassifier())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {'dt__max_features': [1, 2, 3, 5, 8, 13, 19],
              'dt__criterion': ['gini', 'entropy']}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)


Out[39]:
GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True False], n_iter=100, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'dt__criterion': ['gini', 'entropy'], 'dt__max_features': [1, 2, 3, 5, 8, 13, 19]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [40]:
data = enron_df.transpose().to_dict()
dump_classifier_and_data(clf.best_estimator_.steps[0][1], data, features_list)

In [41]:
tester.main()


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=1, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.82073	Precision: 0.31144	Recall: 0.28450	F1: 0.29736	F2: 0.28951
	Total predictions: 15000	True positives:  569	False positives: 1258	False negatives: 1431	True negatives: 11742

With PCA


In [42]:
pipeline = Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ('pca', PCA()),
        ('dt', DecisionTreeClassifier())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {"pca__n_components": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19],
              'dt__max_features': [None, 'auto', 'log2'],
              'dt__criterion': ['gini', 'entropy']}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)


Out[42]:
GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True False], n_iter=100, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19], 'dt__criterion': ['gini', 'entropy'], 'dt__max_features': [None, 'auto', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [43]:
pca = clf.best_estimator_.steps[1][1]
pca.n_components


Out[43]:
15

In [44]:
pca_dt = Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ('pca', PCA(n_components=pca.n_components)),
        ('dt', clf.best_estimator_.steps[2][1])])

In [45]:
dump_classifier_and_data(pca_dt, enron_df.transpose().to_dict(), features_list)
tester.main()


Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=15, whiten=False)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])
	Accuracy: 0.79620	Precision: 0.25269	Recall: 0.27000	F1: 0.26106	F2: 0.26635
	Total predictions: 15000	True positives:  540	False positives: 1597	False negatives: 1460	True negatives: 11403